/* 
Creates tables comparing the analysis sample to 2014 RAIS establishmemts
	// input: estabchars_wide_full, estabtreat_forRAIS_deid, estab_union_act, estlevel_dataset
	// output: TableA6.xlsx, TableA7.xlsx
*/ 

cap log close
cap log using "$logs/sample_descriptives_rais_log", replace


*** Get amenities sample
	use "$files/estab_union_act.dta", clear
	keep if inrange(year, 2012, 2017)
	keep employer_id
	duplicates drop 
	
	 
	// merge cnpj to fake establishment ids 
	gen cnpj=subinstr(employer_id, ".", "", .)
	replace cnpj=subinstr(cnpj, "/", "", .)
	replace cnpj=subinstr(cnpj, "-", "", .)
	gen h = length(cnpj)
	replace cnpj = "00"+cnpj if h==12
	drop h 
	destring cnpj, gen(fakeid_estab) 
	format  fakeid_estab %14.0f
	keep fakeid_estab
	duplicates drop 
	gen amenities_s = 1 
	tempfile ams 
	sa `ams'

*** Get estab sample
	use "$files/estlevel_dataset.dta", clear

	* data toggles
	local singest = 0 //0: no restriction; 1: must be a single establishment firm
	local bothFM = 1 //0: no restriction; 1: must employ both men and women at basline
	local covered = 1 //0: no restriction; 1: must be in the geographic coverage of the CBA
	local signing = 1 //0: no restriction; 1: must be a signing establishment
	local geoglevel = "microregion_mode" //"": for no geog-yr FES; "state_mode": for state-yr FEs; "microregion_mode": for microregion-yr FEs
	local no2011 = 1 //0: include 2011; 1: exclude 2011
	local balanced = 0 //0: no restriction; 1: restrict to the balanced panel
	
	* sample restrictions
	if `singest' {
		keep if bl_singest==1
	}
	if `bothFM' {
		keep if bl_bothFM==1
	}
	if `covered' {
		keep if (bl_covered==1)
	}
	if `signing' {
		keep if (bl_signing==1)
	}
	if `balanced' {
		egen xxx = max(exit), by(fakeid_estab)
		keep if xxx==0
		drop xxx
	}
	keep fakeid_estab
	duplicates drop 
	merge 1:1 fakeid_estab using `ams', keep(1 3) 
	drop _merge 
	gen estab_s = 1 
	tempfile ess
	sa `ess'

**# CHECK TREAT IS THERE AND NON-MISSING?
	use "$files/estabtreat_forRAIS_deid.dta", clear 
	keep fakeid_estab treat
	merge 1:1 fakeid_estab using `ams', gen(_m_ams) 
	assert _m_ams == 3 | _m_ams==1
	drop _m_ams 
	merge 1:1 fakeid_estab using `ess', gen(_m_ess) 
	assert _m_ess ==1 | _m_ess==3 
	drop _m_ess 
	tempfile samples 
	sa `samples'


*** All RAIS establishments with characteristics 
use "$files/estabchars_wide_full.dta", clear 

// get the fakeid firm
format fakeid_estab %20.0f 
tostring fakeid_estab, gen(fakeid_estab_s) force format(%20.0f) 
gen lenid = length(fakeid_estab_s)
qui su lenid 
local max = r(max) 
gen newid = (`max'-lenid)*"0" + fakeid_estab_s
drop lenid 
gen lenid = length(newid)
qui su lenid 
local max = r(max) 
gen fakeid_firm = substr(newid, 1, `max'-6)
destring fakeid_firm, replace 

// merge with our amenities and estab samples 
cap drop _merge 
merge 1:1 fakeid_estab using `samples',  keep(1 3) keepusing(amenities_s estab_s treat)
replace amenities_s = 0 if amenities_s==.
replace estab_s = 0 if estab_s==.

// reshape long & keep only 2014 
keep if has_2014 == 1 
reshape long has_ share_femmanager tot_employment  tot_fememployment, i(fakeid_estab) j(year) 
keep if year == 2014 
isid fakeid_estab

// drop if 0 employment in 2014 
drop if tot_employment==0 | tot_employment==.

// gen share women 
replace tot_fememployment = 0 if tot_fememployment == . 
gen sharewomen = tot_fememployment/tot_employment

// indicator for employs both men and women 
gen has_both = (tot_fememployment>0 & tot_fememployment!=tot_employment) 

// group sectors for table 
gen ind2d = floor(ind_mode/1000)
gen sector = "" 
replace sector = "Agriculture & Extraction" if inrange(ind2d, 1, 9)
replace sector = "Manufacturing" if inrange(ind2d, 10, 33)
replace sector = "Construction & utilities" if inrange(ind2d, 35, 43)
replace sector = "Commerce" if inrange(ind2d, 45, 47)
replace sector = "Services" if inrange(ind2d, 49, 99)
assert sector !=""
tab sector, gen(S_)	

// regions indicators
// state in string version 
gen xxx = ""
replace xxx = "RO" if state_mode==11
replace xxx = "AC" if state_mode==12
replace xxx = "AM" if state_mode==13
replace xxx = "RR" if state_mode==14
replace xxx = "PA" if state_mode==15
replace xxx = "AP" if state_mode==16
replace xxx = "TO" if state_mode==17
replace xxx = "MA" if state_mode==21
replace xxx = "PI" if state_mode==22
replace xxx = "CE" if state_mode==23
replace xxx = "RN" if state_mode==24
replace xxx = "PB" if state_mode==25
replace xxx = "PE" if state_mode==26
replace xxx = "AL" if state_mode==27
replace xxx = "SE" if state_mode==28
replace xxx = "BA" if state_mode==29
replace xxx = "MG" if state_mode==31
replace xxx = "ES" if state_mode==32
replace xxx = "RJ" if state_mode==33
replace xxx = "SP" if state_mode==35
replace xxx = "PR" if state_mode==41
replace xxx = "SC" if state_mode==42
replace xxx = "RS" if state_mode==43
replace xxx = "MS" if state_mode==50
replace xxx = "MT" if state_mode==51
replace xxx = "GO" if state_mode==52
replace xxx = "DF" if state_mode==53
drop state_mode
rename xxx state_mode
encode state_mode, gen(state_mode_num)
gen region = "North" if inlist(state_mode, "AM", "AC", "RO", "RR", "AP", "PA", "TO")
replace region = "Northeast" if inlist(state_mode, "MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA")
replace region = "Central" if inlist(state_mode, "MT", "GO", "DF", "MS")
replace region = "Southeast" if inlist(state_mode, "MG", "ES", "SP", "RJ")
replace region = "South" if inlist(state_mode, "PR", "SC", "RS")
encode region, gen(region_num)
tab region, gen(R_)
	
// indicator for 1-person firms 
gen oneperson_firm = tot_employment==1

// employer size 
gen employer_size = 2 if inrange(tot_employment, 1, 4)
replace employer_size = 3 if inrange(tot_employment, 5, 9)
replace employer_size = 4 if inrange(tot_employment, 10, 19)
replace employer_size = 5 if inrange(tot_employment, 20, 49)
replace employer_size = 6 if inrange(tot_employment, 50, 99)
replace employer_size = 7 if inrange(tot_employment, 100, 249)
replace employer_size = 8 if inrange(tot_employment, 250, 499)
replace employer_size = 9 if inrange(tot_employment, 500, 999)
replace employer_size = 10 if tot_employment>=1000 & tot_employment!=. 
label define size 1 "None" 2 "1-4" 3 "5-9" 4 "10-19" 5 "20-49" ///
6 "50-99" 7 "100-249" 8 "250-499" 9 "500-999" 10 "+1000", replace
label values employer_size size

// gen an indicator for belonging to a single establishment firm, based on estab in 2014
bys fakeid_firm: gen N_estab = _N 
gen singest = (N_estab == 1)
	
// gen share of workforce in our sample 
egen totalworkforce2014 = total(tot_employment)
egen totalworkforce2014_hasboth = total(tot_employment*has_both) // tot workforce in estab employing both F&M 

bys amenities_s: egen totalworkforce2014_inams = total(tot_employment)
bys estab_s: egen totalworkforce2014_iness = total(tot_employment)

bys amenities_s treat: egen totalworkforce2014_inams_byT = total(tot_employment)
bys estab_s treat: egen totalworkforce2014_iness_byT = total(tot_employment)

gen share_inams = totalworkforce2014_inams/totalworkforce2014
gen share_iness = totalworkforce2014_iness/totalworkforce2014_hasboth

gen share_hasboth = totalworkforce2014_hasboth/totalworkforce2014
	
// labels 
label var tot_employment "Size"
label var sharewomen "Share women"
label var has_both "Employs both men and women"
label var singest "Single establishment firm"
label var oneperson_firm "Single person firm"

label var R_1 "Central"
label var R_2 "North"
label var R_3 "Northeast"
label var R_4 "South"
label var R_5 "Southeast"

label var S_1 "Agriculture & Extraction"
label var S_2 "Commerce"
label var S_3 "Construction & utilities"
label var S_4 "Manufacturing"
label var S_5 "Services"

// broad industries
gen cnae = floor(cnaesubcl_mode/1e5)
gen industry = .
replace industry=1 if cnae>=1 & cnae<5 // %farming and fishing 
replace industry=2 if cnae>=5 & cnae<10 // %extractive industries
replace industry=3 if cnae>=10 & cnae<35 // %manufacturing
replace industry=4 if cnae>=35 & cnae<41 // %electricity/gas/utilitiesv
replace industry=5 if cnae>=41 & cnae<45 // %construction
replace industry=6 if cnae>=45 & cnae<49 // %trade
replace industry=7 if cnae>=49 & cnae<55 // %transportation and warehousing
replace industry=8 if cnae>=55 & cnae<58 // %accomodation and food
replace industry=9 if cnae>=58 & cnae<64 // %information and communication
replace industry=10 if cnae>=64 & cnae<68 // %banking and finance
replace industry=11 if cnae>=68 & cnae<69 // %real estate
replace industry=12 if cnae>=69 & cnae<77 // %professional activities
replace industry=13 if cnae>=77 & cnae<84 // %administrative activities
replace industry=14 if cnae>=84 & cnae<85 // %public administration
replace industry=15 if cnae>=85 & cnae<86 // %education
replace industry=16 if cnae>=86 & cnae<90 // %health
replace industry=17 if cnae>=90 & cnae<94 // %culture and sports
replace industry=18 if cnae>=94 & cnae<100 // %other services and organizations
drop cnae
label define ind 1 "Farm" 2 "Extract" 3 "Manuf" 4 "Utilities" 5 "Constr" 6 "Trade" ///
			7 "Transp" 8 "Hosp" 9 "Communic" 10 "Banking" 11 "Real estate" 12 "Profess" ///
			13 "Admin" 14 "Public" 15 "Educ" 16 "Health" 17 "Culture" 18 "Other", replace
label values industry ind
	

**********************************************
*** TABLE WITH STATISTICS COMPARING SAMPLES***
**********************************************

gen Var_name = ""

gen Rais_mean = . 
gen Rais_sd =  . 

gen amenities_s_mean = . 
gen amenities_s_sd = . 
gen amenities_s_p = . 
gen amenities_s_t = . 

gen Rais_hasboth_mean = . 
gen Rais_hasboth_sd =  . 

gen estab_s_mean = . 
gen estab_s_sd = . 
gen estab_s_p = . 
gen estab_s_t = . 

gen order = _n 
sort order 

local tosumm tot_employment sharewomen has_both oneperson_firm singest S_1 S_4 S_3 S_2 S_5 R_2 R_3 R_1 R_4 R_5 

local r = 1 

foreach var of local tosumm {
	
	replace Var_name = "`: var label `var''" in `r' 
	
	su `var' 
	replace Rais_mean = r(mean) in `r' 
	replace Rais_sd = r(sd) in `r' 
	
	su `var' if amenities_s==1
	replace amenities_s_mean = r(mean) in `r' 
	replace amenities_s_sd = r(sd) in `r' 
	ttest `var', by(amenities_s)
	replace amenities_s_p = r(p) in `r' 
	replace amenities_s_t = r(t) in `r' 
	
	
	su `var' if has_both == 1
	replace Rais_hasboth_mean = r(mean) in `r' 
	replace Rais_hasboth_sd = r(sd) in `r' 
	
	
	su `var' if estab_s==1
	replace estab_s_mean = r(mean) in `r' 
	replace estab_s_sd = r(sd) in `r' 
	ttest `var' if has_both == 1, by(estab_s)
	replace estab_s_p = r(p) in `r' 
	replace estab_s_t = r(t) in `r'
	
	local ++r
}

// counts of estab 
replace Var_name = "N establishments" in `r' 

qui count 
replace Rais_mean = r(N) in `r' 

qui count if amenities_s==1
replace amenities_s_mean = r(N) in `r' 

qui count if has_both == 1
replace Rais_hasboth_mean = r(N) in `r' 

qui count if estab_s==1
replace estab_s_mean = r(N) in `r' 

local ++r 

// counts of workers  
replace Var_name = "N workers" in `r' 

qui su totalworkforce2014
replace Rais_mean = r(mean) in `r' 

qui su totalworkforce2014_inams if amenities_s == 1  
replace amenities_s_mean = r(mean) in `r' 

qui su totalworkforce2014_inams if has_both == 1  
replace Rais_hasboth_mean = r(mean) in `r' 

qui su totalworkforce2014_iness if estab_s == 1
replace estab_s_mean = r(mean) in `r' 

local ++r 

// % total workforce 
replace Var_name = "% of workforce" in `r' 	

replace Rais_mean = 1 in `r'  
qui su totalworkforce2014
local tot = r(mean)

qui su totalworkforce2014_inams if has_both == 1  
local totboth = r(mean)
replace Rais_hasboth_mean = r(mean)/`tot' in `r'

qui su totalworkforce2014_inams if amenities_s == 1 // this is share wrt to rais
replace amenities_s_mean = r(mean)/`tot' in `r'

qui su totalworkforce2014_iness if estab_s == 1 // this is share wrt to rais (use totboth to restrict to RAIS with both F & M) 
replace estab_s_mean = r(mean)/`tot' in `r'

preserve 
	keep if Var_name!="" 
	sort order 
	
	keep Var_name Rais_* amenities_s_* estab_s_*
	
	export excel using "$tables/TableA6.xlsx",  sheetreplace  firstrow(variables)
restore 

	
*********************************************************
*** TABLE WITH STATISTICS COMPARING TREAT AND CONTROL ***
*********************************************************

drop Var_name amenities_s_p amenities_s_t estab_s_p estab_s_t order
gen Var_name = ""

gen amenities_s_T_mean = . 
gen amenities_s_T_sd = . 
gen amenities_s_C_mean = . 
gen amenities_s_C_sd = . 

gen amenities_s_p = . 
gen amenities_s_t = . 

gen estab_s_T_mean = . 
gen estab_s_T_sd = . 
gen estab_s_C_mean = . 
gen estab_s_C_sd = . 

gen estab_s_p = . 
gen estab_s_t = . 

gen order = _n 
sort order 

local tosumm tot_employment sharewomen has_both oneperson_firm singest S_1 S_4 S_3 S_2 S_5 R_2 R_3 R_1 R_4 R_5 

local r = 1 

foreach var of local tosumm {
	
	replace Var_name = "`: var label `var''" in `r' 
   
	su `var' if amenities_s==1 & treat == 1
	replace amenities_s_T_mean = r(mean) in `r' 
	replace amenities_s_T_sd = r(sd) in `r' 
	
	su `var' if amenities_s==1 & treat == 0
	replace amenities_s_C_mean = r(mean) in `r' 
	replace amenities_s_C_sd = r(sd) in `r' 
	
	ttest `var' if amenities_s==1, by(treat)
	replace amenities_s_p = r(p) in `r' 
	replace amenities_s_t = r(t) in `r' 
	
	su `var' if estab_s==1 & treat == 1
	replace estab_s_T_mean = r(mean) in `r' 
	replace estab_s_T_sd = r(sd) in `r' 
	
	su `var' if estab_s==1 & treat == 0
	replace estab_s_C_mean = r(mean) in `r' 
	replace estab_s_C_sd = r(sd) in `r' 
	
	ttest `var' if estab_s==1, by(treat)
	replace estab_s_p = r(p) in `r' 
	replace estab_s_t = r(t) in `r'
	
	local ++r
}

// counts of estab 
replace Var_name = "N establishments" in `r' 

qui count 

qui count if amenities_s==1 & treat == 1
replace amenities_s_T_mean = r(N) in `r' 
qui count if amenities_s==1 & treat == 0
replace amenities_s_C_mean = r(N) in `r' 

qui count if estab_s==1 & treat == 1
replace estab_s_T_mean = r(N) in `r' 
qui count if estab_s==1 & treat == 0
replace estab_s_C_mean = r(N) in `r' 

local ++r 

// counts of workers  
replace Var_name = "N workers" in `r' 

qui su totalworkforce2014_inams_byT if amenities_s==1 & treat == 1
replace amenities_s_T_mean = r(mean) in `r' 
qui su totalworkforce2014_inams_byT if amenities_s==1 & treat == 0
replace amenities_s_C_mean = r(mean) in `r' 

qui su totalworkforce2014_iness_byT if estab_s==1 & treat == 1
replace estab_s_T_mean = r(mean) in `r' 
qui su totalworkforce2014_iness_byT if estab_s==1 & treat == 0
replace estab_s_C_mean = r(mean) in `r' 

local ++r 

keep if Var_name!="" 
sort order 

keep Var_name  amenities_s_T_* amenities_s_C_* amenities_s_p amenities_s_t estab_s_T_* estab_s_C_* estab_s_p estab_s_t

export excel using "$tables/TableA7.xlsx",  sheetreplace  firstrow(variables)

	
log close
